# The World Happiness Report determines the state of global happiness.
# The happiness scores and rankings data has been collected by asking
# individuals to rank their life.
# Ranking ranges from 0 (worst possible life) to 10 (best possible life).

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings

# Silence library warnings for the rest of the session (notebook convenience).
warnings.filterwarnings("ignore")

from jupyterthemes import jtplot

jtplot.style(theme='monokai', context='notebook', ticks=True, grid=False)

# Import csv file into pandas dataframe
happy_df = pd.read_csv('happiness_report.csv')
happy_df  # bare expression: displayed by the notebook, no effect in a script

# Print the first 5 rows of the dataframe
happy_df.head()

# Print the last 8 rows of the dataframe
happy_df.tail(8)

# PRACTICE OPPORTUNITY #1 [OPTIONAL]:
# - Select 2 countries from the dataframe and explore scores.
#   Perform sanity check.
# Explore the rows of two individual countries.
happy_df[happy_df['Country or region'] == 'Canada']

happy_df[happy_df['Country or region'] == 'Syria']

# Check the number of non-null values in the dataframe
happy_df.info()

# Check null values (count of missing entries per column)
happy_df.isnull().sum()

# Obtain the statistical summary of the dataframe
happy_df.describe()

# Check the number of duplicated entries in the dataframe
happy_df.duplicated().sum()  # since there are no duplicates, no further action is required

# PRACTICE OPPORTUNITY #2 [OPTIONAL]:
# - Which country has the maximum happiness score?
#   What is the perception of corruption in this country?
# Highest happiness score in the dataset.
happy_df['Score'].max()

# Row(s) holding that maximum. The comparison uses the computed maximum rather
# than a hand-copied float literal, so it keeps working if the data changes.
happy_df[happy_df['Score'] == happy_df['Score'].max()]

# TASK #4: PERFORM DATA VISUALIZATION: PAIRPLOT & SCATTERMATRIX

# A scatterplot matrix is a matrix associated to n numerical arrays (data
# variables), $X_1, X_2, …, X_n$, of the same length.
# The cell (i, j) of such a matrix displays the scatter plot of the variable
# X_i versus X_j.
# Here we show the Plotly Express function px.scatter_matrix to plot the
# scatter matrix for the columns of the dataframe. By default, all columns
# are considered.

# Note:
# Positive correlation between GDP and score
# Positive correlation between Social Support and score

# Columns shared by both the interactive and the static pair plots.
pair_columns = [
    'Score',
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]

fig = px.scatter_matrix(happy_df[pair_columns], width=1500, height=1500)
fig.show()

# Alternatively, you can use Seaborn to plot the pairplots as follows
# (note that the plot is no longer interactive).
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure(...) call is made beforehand (it would only leave an empty figure).
sns.pairplot(happy_df[pair_columns])

# Positive correlation between GDP and score
# Positive correlation between Social Support and score

# TASK #5: PERFORM DATA VISUALIZATION: DISTPLOT & CORRELATION MATRIX
# distplot combines the matplotlib.hist function with seaborn kdeplot()
columns = [
    'Score',
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]

plt.figure(figsize=(20, 50))
# One subplot per column, laid out on an 8 x 2 grid (positions are 1-based).
for position, column in enumerate(columns, start=1):
    plt.subplot(8, 2, position)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) once the seaborn version is confirmed.
    sns.distplot(happy_df[column], color='g')
    plt.title(column)
plt.tight_layout()

# Get the correlation matrix.
# Only numeric columns are used: 'Country or region' holds text, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
corr_matrix = happy_df.select_dtypes(include='number').corr()
corr_matrix

fig = px.imshow(corr_matrix)
fig.show()

sns.heatmap(corr_matrix, annot=True)

# TASK #6: PERFORM DATA VISUALIZATION: SCATTERPLOTS AND BUBBLE CHARTS

# Plot the relationship between score, GDP and region
fig = px.scatter(happy_df, x='GDP per capita', y='Score', text='Country or region')
fig.update_traces(textposition='top center')
fig.update_layout(height=1000)
fig.show()

# Plot the relationship between score and GDP (while adding color and size)
fig = px.scatter(
    happy_df,
    x="GDP per capita",
    y="Score",
    size='Overall rank',
    color="Country or region",
    hover_name="Country or region",
)
fig.update_layout(title_text='Happiness Score vs GDP per Capita')
fig.show()

# Plot the relationship between score and freedom to make life choices
fig = px.scatter(
    happy_df,
    x='Freedom to make life choices',
    y="Score",
    size='Overall rank',
    color="Country or region",
    hover_name="Country or region",
    trendline="ols",
)
fig.update_layout(title_text='Happiness Score vs Freedom to make life choices')
fig.show()

# Using the "cars.csv" dataset included in the guided project package, please
# complete the following tasks:
# - 1. Using Pandas, read the "cars.csv" dataset
# - 2. Perform exploratory data analysis
# - 3. Remove the $ sign and comma (,) from the MSRP and Invoice columns
# - 4. Convert the MSRP and Invoice columns to integer datatypes and perform a
#      sanity check on the data
# - 5. Plot the scattermatrix and pairplot
# - 6. Plot a scatterplot between 'Horsepower' and 'MSRP' while showing 'Make'
#      as text. Use the 'Cylinders' column to display color.
# - 7. Plot the wordcloud of the Make column
# - 8. Plot the histogram of Make and Type of the car using Plotly Express
# - 9. Find out which manufacturer has the highest number of Sports-type cars
# - 10. Find out which manufacturers have Hybrids
# - 11. Plot the correlation matrix using Plotly Express and Seaborn
# - 12. Comment on the correlation matrix: which feature has the highest
#       positive correlation with MSRP?
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt  # pyplot, not the top-level matplotlib package
import seaborn as sns

# 1. Read the dataset
cars_df = pd.read_csv('cars.csv')
cars_df  # bare expression: displayed by the notebook, no effect in a script

# 2. Exploratory data analysis
cars_df.info()

cars_df.describe()

# 3-4. Strip the "$" sign and thousands separators, then convert to integers.
# The pattern is an explicit character class: a bare "$" would be read as the
# regex end-of-string anchor on pandas versions where regex defaults to True,
# leaving the dollar sign in place and making astype(int) fail.
cars_df["MSRP"] = cars_df["MSRP"].str.replace(r"[$,]", "", regex=True).astype(int)
cars_df["MSRP"]

cars_df["Invoice"] = cars_df["Invoice"].str.replace(r"[$,]", "", regex=True).astype(int)
cars_df.head()

# 5. Scatter matrix (interactive) and pairplot (static)
fig = px.scatter_matrix(cars_df, width=1500, height=1500)
fig.show()

sns.pairplot(data=cars_df)

# 6. Horsepower vs MSRP, labelled by Make and coloured by Cylinders
fig = px.scatter(
    cars_df,
    x='Horsepower',
    y='MSRP',
    text='Make',
    color='Cylinders',
    hover_name='Cylinders',
)
fig.update_traces(textposition="top center")
fig.show()

cars_df.Make.unique()
# 8. Histograms of Make and Type
fig = px.histogram(cars_df, x='Make', title='Car Makers')
fig.show()

fig = px.histogram(
    cars_df,
    x='Type',
    title='Car Models',
    color='Make',
    height=700,
    width=900,
)
fig.show()

cars_df.head()

# 9. Rows for Sports-type cars
cars_df[cars_df['Type'] == 'Sports']

# 11. Correlation matrix.
# Restricted to numeric columns: DataFrame.corr() raises on text columns
# (Make and Type are compared against strings in this file) in pandas >= 2.0.
corr_matrix = cars_df.select_dtypes(include='number').corr()
corr_matrix

# using plotly
fig = px.imshow(corr_matrix, height=700)
fig.show()

# using seaborn
sns.heatmap(corr_matrix, annot=True)
# 9. Porsche
# 10. Honda and Toyota
# 12. Invoice has the highest positive correlation with MSRP (approximately 1)

# PRACTICE OPPORTUNITY #1 SOLUTION:
# - Select 2 countries from the dataframe and explore scores. Perform sanity check.
# Rows for the two selected countries.
happy_df[happy_df['Country or region'] == 'Canada']

happy_df[happy_df['Country or region'] == 'Zimbabwe']

# PRACTICE OPPORTUNITY #2 SOLUTION:
# - Which country has the maximum happiness score?
#   What is the perception of corruption in this country?
# The statistical summary shows the maximum of the 'Score' column.
happy_df.describe()

# Row(s) with the maximum score. The maximum is computed rather than copied by
# hand, which avoids a fragile float-literal equality test.
happy_df[happy_df['Score'] == happy_df['Score'].max()]

import numpy as np  # Multi-dimensional array object
import pandas as pd  # Data Manipulation
import seaborn as sns  # Data Visualization
import matplotlib.pyplot as plt  # Data Visualization
import plotly.express as px  # Interactive Data Visualization

# Read the CSV file
car_df = pd.read_csv("cars.csv")

# Load the top 10 instances
car_df.head(10)

# Load the bottom 10 instances
car_df.tail(10)

# Display the feature columns
car_df.columns

# Check if any missing values are present in the dataframe
car_df.isnull().sum()

# Obtain the summary of the dataframe
car_df.info()

# Convert MSRP and Invoice datatype to integer, so we need to remove the $ sign
# and comma (,) from these 2 columns.
# The pattern is an explicit character class: a bare "$" would be read as the
# regex end-of-string anchor on pandas versions where regex defaults to True,
# leaving the dollar sign in place and making astype(int) fail.
car_df["MSRP"] = car_df["MSRP"].str.replace(r"[$,]", "", regex=True).astype(int)
car_df["MSRP"]

car_df["Invoice"] = car_df["Invoice"].str.replace(r"[$,]", "", regex=True).astype(int)

# Let's view the updated MSRP and Invoice columns
car_df.head()

# Display the updated summary of the dataframe
car_df.info()

fig = px.scatter_matrix(car_df, width=2000, height=2000)
fig.show()
# Alternatively, you can use scatterplots to show the joint relationships and
# histograms for univariate distributions
sns.pairplot(data=car_df)

# Horsepower vs MSRP, labelled by Make and coloured by Cylinders.
fig = px.scatter(
    car_df,
    x='Horsepower',
    y='MSRP',
    text=car_df['Make'],
    color=car_df['Cylinders'],
)
fig.update_traces(textposition='top center')
fig.update_layout(height=2000, width=2000)
fig.show()
# Let's view various makes of the cars
car_df.Make.unique()

fig = px.histogram(
    car_df,
    x="Make",
    labels={"Make": "Manufacturer"},
    title="MAKE OF THE CAR",
    color_discrete_sequence=["maroon"],
)
fig.show()

# Let's view various types of the cars
car_df.Type.unique()

fig = px.histogram(
    car_df,
    x="Type",
    labels={"Type": "Type"},
    title="TYPE OF THE CAR",
    color_discrete_sequence=["blue"],
)
fig.show()

# Let's plot the location
car_df.Origin.unique()

fig = px.histogram(
    car_df,
    x="Origin",
    labels={"Origin": "Origin"},
    title="LOCATION OF THE CAR SALES",
    color_discrete_sequence=["brown"],
)
fig.show()

# Let's view the drivetrain of the cars
car_df.DriveTrain.unique()

fig = px.histogram(
    car_df,
    x="DriveTrain",
    labels={"DriveTrain": "Drivetrain"},
    title="DRIVETRAIN OF THE CAR",
    color_discrete_sequence=["BLACK"],
)
fig.show()

# Plot the make of the car and its location
fig = px.histogram(
    car_df,
    x="Make",
    color="Origin",
    labels={"Make": "Manufacturer"},
    title="MAKE OF THE CAR Vs LOCATION",
)
fig.show()
# Let's view the model of all used cars using WordCloud generator
from wordcloud import WordCloud, STOPWORDS

car_df  # bare expression: displayed by the notebook, no effect in a script

# Build one whitespace-separated string of model names. Joining the values
# avoids feeding WordCloud the printed form of a NumPy array (brackets, quotes
# and, for long arrays, a truncation ellipsis).
text = " ".join(car_df["Model"].astype(str))

stopwords = set(STOPWORDS)

wc = WordCloud(
    background_color="black",
    max_words=2000,
    max_font_size=100,
    random_state=3,  # fixed seed passed to WordCloud
    stopwords=stopwords,
    contour_width=3,
).generate(text)

fig = plt.figure(figsize=(25, 15))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

# Obtain the correlation matrix.
# Restricted to numeric columns: DataFrame.corr() raises on text columns
# (e.g. Make, Model) in pandas >= 2.0.
corr_matrix = car_df.select_dtypes(include='number').corr()
corr_matrix

fig = px.imshow(corr_matrix)
fig.show()

plt.figure(figsize=(8, 8))
sns.heatmap(corr_matrix, cmap="YlGnBu", annot=True)

fig = px.histogram(
    car_df,
    x="Make",
    color="Type",
    labels={"Make": "Manufacturer"},
    title="MAKE AND TYPE OF THE CAR",
    opacity=1,
)
fig.show()

# Porsche
# Honda and Toyota
# Positive correlation between engine size and number of cylinders
# Positive correlation between horsepower and number of cylinders
# Highest positive correlation with MSRP is horsepower
# NOTE(review): Invoice is also a numeric column after cleaning and may
# correlate with MSRP even more strongly - confirm against the matrix above.